/*
PROGRAM BY: JEFFREY WANG
Last edited 05/14/2021

This program creates a concordance between mat code from CMF-MAT and HS-10 code from LFTTD

*/ 

/* Pull in the shared setup file; its contents are not visible from this program. */
%include 'yyyy/pdata.sas';

/* Process every observation (no OBS= cap) in the steps below. */
options obs=max;
    

/* ecroot: permanent library holding the cmf_mat_match<year> inputs read by
   mat_hs_match and the final mat_code_bridge_all output. */
libname ecroot 'xxxx/imp_type/data';
/* input: folder of the HS/SIC/NAICS concordance .dta file read by PROC IMPORT. */
%let input=xxxx/input_data ;
/* output: not referenced anywhere in this program as shown -
   NOTE(review): confirm whether it is still needed. */
%let output=xxxx/data ;

/*
  nmerge(l): one pass of the cascading MNAICS -> HS match at NAICS digit level &l.

  Parameter
    l   NAICS digit level to match on (this program calls it with 6, 5, 4, 3).

  Reads (WORK)
    unmatched<l+1>   codes still unmatched after the previous pass; must contain
                     naics&l and mc_total.
    hs_naics02       HS/NAICS concordance; must contain naics&l, hs and
                     naics_matchtype.

  Writes (WORK)
    mnaics_n&l       mc_total summed by naics&l.
    mat_hs_naics_&l  rows whose naics&l is present in hs_naics02, flagged with
                     matched=1 and match_level=&l.
    unmatched&l      rows with no concordance match, carrying the (l-1)-digit
                     prefix naics<l-1> used as the key of the next pass.

  Side effect: hs_naics02 and unmatched<l+1> are re-sorted in place by naics&l.
*/
%macro nmerge(l);
    /* Declare the helper counters local so that %let cannot overwrite
       same-named macro variables in the calling or global scope. */
    %local li lo;
    %let li=%eval(&l+1);   /* suffix of the input dataset unmatched<l+1>   */
    %let lo=%eval(&l-1);   /* digit length of the key for the next pass    */

    /* sort both sides by the merge key */
    proc sort data=hs_naics02;
        by naics&l;
    run;

    proc sort data=unmatched&li;
        by naics&l;
    run;

    /* collapse to one row per naics&l, summing material cost */
    proc means sum noprint data=unmatched&li;
        by naics&l;
        var mc_total;
        output out=mnaics_n&l(keep=naics&l mc_total) sum()= ;
    run;

    /* match on naics&l; split the result into matched and unmatched rows */
    data mat_hs_naics_&l(drop=naics&lo) unmatched&l(keep=naics&lo mc_total);
        length naics&lo $ &lo;
        merge mnaics_n&l(in=a) hs_naics02(in=b keep=naics&l hs naics_matchtype);
        by naics&l;
        if a=1;   /* keep only codes that appear in the material data */
        if b=1 then matched = 1;
        if b=1 then match_level = &l;
        /* shorter prefix, used as the key if this code has to be retried */
        naics&lo = substr(naics&l,1,&lo);
        if b=1 then output mat_hs_naics_&l;
        else output unmatched&l;
    run;
%mend;
        
/*
  mat_hs_match(y): build the material-code -> HS-10 bridge for year &y.

  Parameter
    y   calendar year, 1992-2016. Any other value writes an ERROR line to the
        log and builds nothing.

  Inputs
    ecroot.cmf_mat_match<cy>   material data of the census year cy that covers
                               &y (1992, 1997, 2002, 2007 or 2012). Uses
                               variables m and mc for 1992-1996, mnaics and mc
                               from 1997 on.
    &input/hs_sic_naics_imports_89_117_20180927_old.dta
                               HS / SIC / NAICS concordance; only the rows with
                               year = &y are used.

  Output (WORK)
    mat_code_bridge&y.   hs, sic (1992-1996) or naics (1997+), match_level, year.

  Method
    1992-1996: the first 4 characters of m are matched to the concordance on
               SIC-4, and what is left over on SIC-3.
    1997-2016: the first 6 characters of mnaics are matched on NAICS-6, then
               NAICS-5, -4 and -3 through the nmerge macro.

  Many intermediate WORK datasets are created and overwritten on every call.
*/
%macro mat_hs_match(y);
    %local cy;

    /* Refuse years with no material-census input. Without this check the
       steps below would either fail or pick up WORK datasets left over from
       an earlier call. */
    %if &y. < 1992 or &y. > 2016 %then %do;
        %put ERROR: mat_hs_match - year &y. is outside the supported range 1992-2016, no bridge was built.;
        %return;
    %end;

    /* Census year covering &y: 1992, 1997, 2002, 2007 or 2012.
       %eval uses integer division, so (&y-1992)/5 truncates. */
    %let cy=%eval(1992 + 5*((&y. - 1992)/5));

    /* ------------------------------------------------------------------ */
    /* 1992-1996: material codes are SIC based                            */
    /* ------------------------------------------------------------------ */
    %if &y. < 1997 %then %do;

        data cmfmat&y.;
            set ecroot.cmf_mat_match1992;
        run;

        /* one row per material code m with its total cost */
        proc means data=cmfmat&y. NWAY NOPRINT;
            class m;
            var mc;
            output out=m_code sum = mc_total;
        run;

        /* the SIC code is the first 4 characters of the material code */
        data m_sic;
            set m_code;
            sic=substr(m,1,4);
            sic4=sic;
        run;

        proc import out=hs_sic datafile = "&input/hs_sic_naics_imports_89_117_20180927_old.dta" dbms=dta replace;
        run;

        /* keep the concordance rows of year &y and derive the 4- and 3-digit
           SIC keys; an X in the SIC code is replaced by 0 */
        data hs_sic02(keep = hs sic: sic_matchtype);
            length sic4 $ 4 sic3 $ 3  hs $ 10;
            set hs_sic(rename=(commodity=hs10));
            sic4 = tranwrd(sic,'X','0');
            sic3=substr(sic4,1,3);
            /* Left-pad hs10 with zeros to 10 digits: add 1e10, then take
               characters 3-12 of the implicitly converted value.
               NOTE(review): assumes hs10 is a non-negative integer below 1e10
               and the default 12-wide numeric-to-character conversion. */
            hs2=hs10+10000000000;
            hs=substr(hs2,3,10);
            if year=&y.;
        run;

        /* Starting point of the cascade. sic4 is declared $4 so that it has
           the same length as the BY variable in hs_sic02 (the original
           declared an unused naics6 here instead). */
        data unmatched5;
            length sic4 $ 4;
            set m_sic (drop=sic4);
            sic4=sic;
        run;

        /* ---- SIC-4 pass ---- */
        proc sort data=hs_sic02;
            by sic4;
        run;

        proc sort data=unmatched5;
            by sic4;
        run;

        /* collapse material cost to one row per sic4 */
        proc means sum noprint data=unmatched5;
            by sic4;
            var mc_total;
            output out=m_s4(keep=sic4 mc_total) sum()= ;
        run;

        /* match on sic4; unmatched codes carry sic3 into the next pass */
        data mat_hs_sic_4(drop=sic3) unmatched4(keep=sic3 mc_total);
            length sic3 $ 3;
            merge m_s4(in=a) hs_sic02(in=b keep=sic4 hs sic_matchtype);
            by sic4;
            if a=1;
            if b=1 then matched = 1;
            if b=1 then match_level = 4;
            sic3 = substr(sic4,1,3);
            if b=1 then output mat_hs_sic_4;
            else output unmatched4;
        run;

        /* ---- SIC-3 pass ---- */
        proc sort data=hs_sic02;
            by sic3;
        run;

        proc sort data=unmatched4;
            by sic3;
        run;

        /* collapse material cost to one row per sic3 */
        proc means sum noprint data=unmatched4;
            by sic3;
            var mc_total;
            output out=m_s3(keep=sic3 mc_total) sum()= ;
        run;

        /* match on sic3; whatever is still unmatched ends up in unmatched3 */
        data mat_hs_sic_3(drop=sic2) unmatched3(keep=sic2 mc_total);
            length sic2 $ 2;
            merge m_s3(in=a) hs_sic02(in=b keep=sic3 hs sic_matchtype);
            by sic3;
            if a=1;
            if b=1 then matched = 1;
            if b=1 then match_level = 3;
            sic2 = substr(sic3,1,2);
            if b=1 then output mat_hs_sic_3;
            else output unmatched3;
        run;

        /* stack both passes; a 3-digit match gets a trailing 0 as its sic */
        data mat_hs_all;
            set mat_hs_sic_4 mat_hs_sic_3;
            sic=sic4;
            if sic=" " & sic3~=" " then sic=sic3||"0";
        run;

        /* dataset with the mappings */
        data mat_code_bridge&y.;
            set mat_hs_all(keep=hs sic match_level);
            year = &y.;
        run;
    %end;

    /* ------------------------------------------------------------------ */
    /* 1997-2016: material codes are NAICS based                          */
    /* ------------------------------------------------------------------ */
    %else %do;

        data cmfmat&y.;
            set ecroot.cmf_mat_match&cy.;
        run;

        /* one row per material code mnaics with its total cost */
        proc means data=cmfmat&y. NWAY NOPRINT;
            class mnaics;
            var mc;
            output out=mnaics_code sum = mc_total;
        run;

        /* the NAICS code is the first 6 characters of the material code */
        data mnaics_naics;
            set mnaics_code;
            naics=substr(mnaics,1,6);
            naics6=naics;
        run;

        proc sort data=mnaics_naics;
            by naics6;
        run;

        /*
          Match MNAICS with HS-10 using NAICS as the middle point.

          First match on naics6; observations matched there are done.
          The remainder is then matched on NAICS-5, -4 and -3 in turn.
        */

        /* import the HS-NAICS import concordance (Pierce-Schott) */
        proc import out=hs_naics datafile = "&input/hs_sic_naics_imports_89_117_20180927_old.dta" dbms=dta replace;
        run;

        /* keep the concordance rows of year &y and derive the 6/5/4/3-digit
           NAICS keys; an X in the NAICS code is replaced by 0 */
        data hs_naics02(keep = hs naics: naics_matchtype);
            length naics6 $ 6 naics5 $ 5 naics4 $ 4 naics3 $ 3  hs $ 10;
            set hs_naics(rename=(commodity=hs10));
            naics6 = tranwrd(naics,'X','0');
            naics5=substr(naics6,1,5);
            naics4=substr(naics6,1,4);
            naics3=substr(naics6,1,3);
            /* Left-pad hs10 with zeros to 10 digits: add 1e10, then take
               characters 3-12 of the implicitly converted value.
               NOTE(review): assumes hs10 is a non-negative integer below 1e10
               and the default 12-wide numeric-to-character conversion. */
            hs2=hs10+10000000000;
            hs=substr(hs2,3,10);
            if year=&y.;
        run;

        /* starting point of the cascade: nmerge(6) reads unmatched7 */
        data unmatched7;
            length naics6 $ 6;
            set mnaics_naics (drop=naics6);
            naics6=naics;
        run;

        /* cascade: each pass hands its unmatched codes to the next one */
        %nmerge(6);
        %nmerge(5);
        %nmerge(4);
        %nmerge(3);

        /* stack all passes; shorter matches are right-padded with zeros */
        data mat_hs_all;
            set mat_hs_naics_6 mat_hs_naics_5 mat_hs_naics_4 mat_hs_naics_3;
            naics=naics6;
            if naics=" " & naics5~=" " then naics=naics5||"0";
            if naics=" " & naics4~=" " then naics=naics4||"00";
            if naics=" " & naics3~=" " then naics=naics3||"000";
        run;

        /* Match diagnostics: share of material cost matched, by match level.
           mc_total is unique by naics, so one row per naics is kept.
           The cost that never matched is in unmatched3. */
        proc sort data=mat_hs_all; by naics; run;

        data match_stats;
            set mat_hs_all;
            by naics;
            if first.naics=1;
        run;

        data match_stats2(keep=naics mc_total match_level matched);
            set match_stats unmatched3(in=b);
            if b=1 then matched=0;
            if b=1 then match_level=0;
        run;

        proc sort data=match_stats2; by match_level; run;

        /* material cost by match level */
        proc means sum noprint data=match_stats2;
            by match_level;
            var mc_total;
            output out=match_stats3(rename=(_type_=mvar)) sum()= ;
        run;

        /* total material cost */
        proc means sum noprint data=match_stats2;
            var mc_total;
            output out=match_tots(rename=(_type_=mvar) drop=_freq_) sum(mc_total)=tot_mc;
        run;

        /* dataset with the mappings */
        data mat_code_bridge&y.;
            set mat_hs_all(keep=hs naics match_level);
            year = &y.;
        run;
    %end;
%mend;

/* Build the bridge for 2007 only, then save it to the permanent library. */
%macro loop1();
    /* Keep the loop index local so that it cannot overwrite a same-named
       macro variable in the global scope. */
    %local y;
    %do y=2007 %to 2007 %by 1;
        %mat_hs_match(&y.);
    %end;
%mend;
%loop1();

/* mat_code_bridge2007 is the WORK dataset written by mat_hs_match(2007) */
data ecroot.mat_code_bridge_all;
    set mat_code_bridge2007;
run;
        



/* 
*append everything together for all years;
%macro loop1();
    %do y=1992 %to 2016 %by 1;
        %mat_hs_match(&y.);
    %end;
%mend;
%loop1();

%macro loop2();
    data ecroot.mat_code_bridge_all;
        set mat_code_bridge1992;
    run;

    %do y=1993 %to 2016 %by 1;
        data ecroot.mat_code_bridge_all;
            set ecroot.mat_code_bridge_all mat_code_bridge&y.;
        run;
    %end;
%mend;
%loop2();
*/